We want to explore the restaurant data generally, in terms of rating, review count, postcode, price, category and missing data.
library(tidyverse)
library(ggplot2)
# Load the pre-processed restaurant data set that the plots below draw from.
food <- read.csv(file = "food_processed.csv")
# Rating distribution: histogram of restaurant ratings in 0.5-wide bins.
# Author's reading of the plot: most restaurants are rated 3.0-4.0, none is
# rated 5, and the distribution is left-skewed.
# The column is mapped by its bare name; writing `food$Rating` inside aes()
# bypasses ggplot2's data masking and labels the axis "food$Rating".
ggplot(data = food, aes(x = Rating)) +
  geom_histogram(binwidth = 0.5)
From the histogram we can see that most of our restaurants are rated between 3.0 and 4.0 and that no restaurant is rated 5. The distribution is left-skewed.
# Postcode distribution: number of restaurants per zip code.
# zip_code is wrapped in factor() so it is treated as a discrete axis, and it
# is referenced by bare name rather than `food$zip_code` so that aes() resolves
# it against the `data` argument.
ggplot(data = food, aes(x = factor(zip_code))) +
  geom_bar()
From the bar chart we can see that most of our restaurants are located in the 10036 and 10019 districts. The restaurants in 10011, 10012 and 10033 might be outliers.
# Price distribution: number of restaurants at each price level.
# Price is mapped by bare name (not `food$Price`) so aes() looks it up in the
# `data` argument and the axis is labelled "Price".
ggplot(data = food, aes(x = Price)) +
  geom_bar()
We can see that the majority of restaurants have a price level of 2 and no restaurant has a price level of 5. Since our price data come from the dollar signs returned by the Yelp API, the value 2.5 is probably an erroneous data point.
# Review-count distribution: histogram with ggplot2's default binning.
# Review_Count is mapped by bare name (not `food$Review_Count`) so aes()
# resolves it against the `data` argument.
ggplot(data = food, aes(x = Review_Count)) +
  geom_histogram()
From the plot we can see that most of our restaurants have fewer than 1000 reviews. The higher the review count, the fewer restaurants there are. There are also some outliers that have more than 1000 reviews.
# Category distribution: horizontal bar chart of restaurants per category.
# Author's reading of the plot: the most common categories are American, Deli,
# Italian, Pizza and Steakhouse.
# Category_data is mapped by bare name instead of `food$Category_data`;
# coord_flip() puts the category labels on the vertical axis.
ggplot(data = food, aes(x = Category_data)) +
  geom_bar() +
  coord_flip()
From this plot we can see that there are 35 categories in total and that the most common categories are American, Deli, Italian, Pizza and Steakhouse.
# Second-level category distribution: horizontal bar chart of restaurants per
# second-level category. The column is mapped by bare name instead of
# `food$Category_2nd_Level` so aes() resolves it against the `data` argument.
ggplot(data = food, aes(x = Category_2nd_Level)) +
  geom_bar() +
  coord_flip()
According to the second-level category, North American, Deli and Europe are the 3 most common categories.
# Rating distribution filled by district (postcode).
# Bars are dodged per zip code and their height is the share of all counted
# restaurants (count / total count). Columns are mapped once, by bare name;
# the original mapped `food$Rating` in both ggplot() and geom_histogram(),
# which bypasses data masking and duplicates the x mapping.
ggplot(data = food, aes(x = Rating, fill = factor(zip_code))) +
  geom_histogram(
    aes(y = (..count..) / sum(..count..)),
    breaks = seq(0, 5, by = 0.5),
    position = "dodge"
  )
# Disabled overlay kept for reference:
# + geom_density(aes(y = ..density..), alpha = .3)
We can see that restaurant ratings in district 10036 peak around 3.5, while restaurant ratings in district 10019 peak around 3.0.
# Price distribution by district (postcode), drawn as a per-group density.
# Author's reading of the plot: there is not much difference between 10019 and
# 10036; in both districts most restaurants have a price level of 2.
# Columns are mapped by bare name; `food$Price` inside aes() bypasses data
# masking and repeated the x mapping in the layer.
# Earlier count-based variant kept for reference:
# ggplot(data = food, aes(x = Price, fill = factor(Postcode))) +
#   geom_histogram(binwidth = 1, position = "dodge")
ggplot(data = food, aes(x = Price, fill = factor(zip_code))) +
  geom_histogram(aes(y = ..density..), binwidth = 1, position = "dodge")
# FIX ME LATER! category data need to be cleaned
# FIX ME: draw density of the plot
For price, there is not much difference between 10019 and 10036: in both districts most restaurants have a price level of 2. However, no restaurant in 10019 has a price level of 1.
We can also see that 10011, 10012 and 10033 each have only 1 restaurant, so their density will always be 1.
# Review-count distribution by postcode: dodged histogram with 50-wide bins.
# xlim(0, 800) restricts the x scale, so restaurants outside that range are
# dropped from this plot. Columns are mapped by bare name rather than
# `food$...` so aes() resolves them against the `data` argument.
ggplot(data = food, aes(x = Review_Count, fill = factor(zip_code))) +
  geom_histogram(position = "dodge", binwidth = 50) +
  xlim(0, 800)
# Box plots of review count per zip code, flipped so zip codes run down the
# vertical axis.
# Author's reading of the plot: restaurants in 10036 are more likely to have
# more reviews.
# Columns are mapped by bare name rather than `food$...` so aes() resolves
# them against the data frame passed to ggplot().
ggplot(food, aes(x = factor(zip_code), y = Review_Count)) +
  geom_boxplot() +
  coord_flip()
##
For review count, we find that restaurants in 10036 are more likely to have more reviews. The restaurants in 10018 are sparser.
# Rating against review count, drawn two ways from the same mapping:
# a hexagonal-bin heat map and a jittered, semi-transparent scatter plot.
rating_review_map <- aes(x = Rating, y = Review_Count)
ggplot(food, rating_review_map) +
  geom_hex()
ggplot(food, rating_review_map) +
  geom_jitter(alpha = 0.3)
From the plot we can see that: 1. There is a cluster around ratings 3–4 with fewer than 500 reviews. 2. The more reviews a restaurant has, the more likely it is to have a high rating. 3. No restaurant with more than 600 reviews has a low rating. 4. The restaurants with over 1000 reviews are outliers, and they are all rated 3.5 or 4.0.
# Price against review count, drawn two ways from the same mapping:
# a hexagonal-bin heat map and a jittered, semi-transparent scatter plot.
price_review_map <- aes(x = Price, y = Review_Count)
ggplot(food, price_review_map) +
  geom_hex()
ggplot(food, price_review_map) +
  geom_jitter(alpha = 0.3)
1. There is a cluster around price 1–2 with fewer than 500 reviews. 2. The outliers are the points with over 1000 reviews and ratings over 3.
## mosaic plot
#library(vcd)
#data<-subset(food,food$zip_code %in% c(10036,10019,10018,10020))
#data<- subset(data,data$Price %in% c(1,2,3,4))
#counts2 <- data%>% drop_na(`zip_code`,`Price`)%>%group_by(data$`Price`, data$`zip_code`) %>%
# summarize(Freq =n())
#colnames(counts2)<-c('Price','zip_code','Freq')
#vcd::mosaic(Price~zip_code,direction = c('v','h'),counts2,rot_labels=c(0,90,0,0))
#data_new<-subset(food,food$zip_code %in% c(10036,10019,10018,10020))
#data_new<- subset(data_new,data_new$Category_2nd_Level %in% c('North American','Deli','Europe','Asian'))
#data_new<-subset(data_new,data_new$Category_data %in% c('American','Deli','Italian','Coffee'))
#try<-data_new%>%drop_na(`zip_code`,`Category_data`)%>%group_by(data_new$`Category_data`, data_new$`zip_code`) %>%
# summarize(Freq =n())
#counts3 <- data_new %>% drop_na(`zip_code`,`Category_2nd_Level`)%>%group_by(data_new$`Category_2nd_level`, data_new$`zip_code`) %>% summarize(Freq =n())
#colnames(try)<-c('Category_data','zip_code','Freq')
#vcd::mosaic(factor(Category_data)~zip_code,direction = c('v','h'),try,rot_labels=c(0,90,0,0))
# Mosaic plot of second-level category against zip code, restricted to four
# zip codes and four second-level categories.
# Columns are referenced by bare name inside subset(); %in% never yields NA,
# so rows with a missing zip code or category are excluded by the filter.
data_new <- subset(
  food,
  zip_code %in% c(10036, 10019, 10018, 10020) &
    Category_2nd_Level %in% c("North American", "Deli", "Europe", "Asian")
)
# Group by the piped data's own columns. The original grouped by
# `data_new$...`, which refers to the frame as it was *before* drop_na() and
# fails with a length mismatch as soon as drop_na() removes a row.
# NOTE(review): the count column for the second-level category is still named
# `Category_data` so the plot label is unchanged; consider renaming it.
counts3 <- data_new %>%
  drop_na(zip_code, Category_2nd_Level) %>%
  group_by(Category_data = Category_2nd_Level, zip_code) %>%
  summarize(Freq = n()) %>%
  ungroup() %>%
  as.data.frame()
vcd::mosaic(
  factor(Category_data) ~ zip_code,
  data = counts3,
  direction = c("v", "h"),
  rot_labels = c(0, 90, 0, 0)
)
From the plot we can see that the majority of the data lies in 10019 and 10036, and that the district influences the restaurant category. There are more North American restaurants in 10036, and relatively fewer restaurants of other types.
library(GGally)
library(ggplot2)
library(tidyverse)
# Interactive parallel-coordinates plot of the restaurant attributes.
# Columns are selected by name instead of by position (4:9 and 11 in the CSV)
# so the selection survives a column being added or reordered.
parallel_data <- food[c(
  "Category_data", "Category_2nd_Level", "Rating", "Review_Count",
  "Price", "Street_Num", "Score"
)]
parallel_data$Category_data <- factor(parallel_data$Category_data)
# parallel_data$zip_code <- factor(parallel_data$zip_code)
parallel_data$Category_2nd_Level <- factor(parallel_data$Category_2nd_Level)
parallel_data$Price <- factor(parallel_data$Price)
# Static GGally alternatives, kept for reference:
# ggparcoord(parallel_data, alphaLines = .7, groupColumn = 'Category_2nd_Level', scale = "uniminmax") + ylab('Data') + xlab('Indicator')
# ggparcoord(parallel_data, alphaLines = .7, groupColumn = 'Category_2nd_Level', scale = "globalminmax") + ylab('Data') + xlab('Indicator')
# ggparcoord(parallel_data, alphaLines = .7, groupColumn = 'Category_2nd_Level', scale = "robust") + ylab('Data') + xlab('Indicator')
# ggparcoord(parallel_data, alphaLines = .7, groupColumn = 'Category_2nd_Level', scale = "std") + ylab('Data') + xlab('Indicator')
# See: http://www.buildingwidgets.com/blog/2015/1/30/week-04-interactive-parallel-coordinates-1
# Install parcoords from GitHub only when it is missing; the original
# reinstalled it (network access, possible rebuild) on every run.
if (!requireNamespace("parcoords", quietly = TRUE)) {
  devtools::install_github("timelyportfolio/parcoords")
}
library(parcoords)
# Keep restaurants whose review count is a whole number from 1 to 1000;
# %in% also drops rows where Review_Count is NA.
parallel_data <- subset(parallel_data, Review_Count %in% 1:1000)
# parallel_data$zip_code <- factor(parallel_data$zip_code)
parcoords(
  parallel_data,
  rownames = FALSE,
  brushMode = "2D-strums",
  reorderable = TRUE,
  queue = TRUE,
  alpha = .5,
  color = list(
    colorBy = "Category_2nd_Level",
    colorScale = htmlwidgets::JS("d3.scale.category10()")
  )
)
It’s obvious that there are 3 missing-data patterns in our data. The most common one is missing Price, Rating, Review Count and Score. The second is missing category data. The third is missing both. The first pattern arises because we can’t grab the data from the Yelp API.
library(tidyverse)
library(ggplot2)
library(dplyr)
library(tidyr)
library(DAAG)
# Re-read the data, this time treating empty strings and "NA" as missing,
# and use the ID column as row names.
food <- read.csv(
  file = "food_processed.csv",
  header = TRUE,
  na.strings = c("", "NA")
)
rownames(food) <- food$ID

# Long format: one row per (restaurant id, column) pair, flagged "yes"/"no"
# according to whether that cell is missing.
tidyfood <- gather(rownames_to_column(food, var = "id"), key, value, -id)
tidyfood <- mutate(tidyfood, missing = ifelse(is.na(value), "yes", "no"))

# Tile map of missingness: columns on the x axis, restaurants on the y axis.
ggplot(tidyfood, aes(x = key, y = fct_rev(id), fill = missing)) +
  geom_tile(color = "white") +
  labs(title = "food data with missing values") +
  # scale_fill_viridis_d() +  # discrete scale
  theme_bw()
# There are some missing values in the food data set, which combines "NYC Open Data" and Yelp: categories (from "NYC Open Data") and ratings, price levels and review counts (from Yelp). Our assumptions for those missing values are
# Alluvial diagram of second-level category -> rating -> price -> zip code.
# Columns are selected by name (they are columns 2 and 4:11 of the CSV, as
# listed in the colnames() output below) rather than by position.
food_pl <- as.data.frame(food[, c(
  "ID", "Category_data", "Category_2nd_Level", "Rating", "Review_Count",
  "Price", "Street_Num", "zip_code", "Score"
)])
# round() rounds halves to the even integer, so a rating of 2.5 becomes 2
# while 3.5 becomes 4.
food_pl$Score <- round(food_pl$Score)
food_pl$Rating <- round(food_pl$Rating)
# Every column becomes a factor so it can serve as a categorical axis.
food_pl[] <- lapply(food_pl, factor)
# Drop every row that has a missing value in any column. The original call
# passed `cols = c("Score")`, but na.omit() on a data frame has no such
# argument and silently ignores it, so this was already the actual behaviour;
# the misleading argument is removed.
food_plna <- na.omit(food_pl)
colnames(food_plna)
## [1] "ID" "Category_data" "Category_2nd_Level"
## [4] "Rating" "Review_Count" "Price"
## [7] "Street_Num" "zip_code" "Score"
# One row per distinct attribute combination with its frequency. The result
# is ungrouped so later column selection is not affected by leftover groups.
food_al <- food_plna %>%
  group_by(
    Category_2nd_Level, Rating, Price, Street_Num, zip_code, Review_Count,
    Category_data
  ) %>%
  summarise(Freq = n()) %>%
  ungroup()
#tidyfood2 <- food %>%
# group_by(Category_data, Category_2nd_Level, Rating, Review_Count, Street_Num, zip_code) %>%
# summarise(n = sum(ID))
#tidyfd <- food %>% rownames_to_column("Name") %>%
# gather(key = ID, value = , -Name, -School)
library(alluvial)
# NOTE(review): the palette has 10 colours; more than 10 distinct second-level
# categories would index past it and give NA colours. Confirm against the data.
pal <- RColorBrewer::brewer.pal(10, "Set3")
alluvial(
  food_al[, c("Category_2nd_Level", "Rating", "Price", "zip_code")],
  freq = food_al$Freq,
  blocks = TRUE,
  alpha = 0.8,
  # Colour each ribbon by its second-level category.
  col = pal[match(
    food_al$Category_2nd_Level,
    unique(food_al$Category_2nd_Level)
  )]
)